import time
import re
import openai
import requests
import pandas as pd
import multiprocessing

# Module-level OpenAI API client, constructed with no explicit arguments and
# used by sumThemeEmotions() for every request.
# NOTE(review): presumably the API key is picked up from the OPENAI_API_KEY
# environment variable (cf. the commented-out line below) — confirm.
client = openai.OpenAI()

# api_key = os.getenv('OPENAI_API_KEY')

# sys = "You are a textual analyst summarizing the contents of pop song lyrics written mostly in Mandarin Chinese, but sometimes in Cantonese or Taiwanese dialects, sometimes in English, Japanese, or other languages, and sometimes in multiple and mixed languages (like English or Korean words or sentences mixed into Chinese)."

# usr = "For this song lyric, Don't translate, but use exactly one very short English sentence with no more than 20 words to summarize the theme of the song, and then choose three English nouns to name three major emotions in the song lyric. Use this format and remove all brackets: 'Song [replace this with the song's id number]:\nTheme: [replace this with the short theme-summarizing sentence]\nMajor Emotions: [emotion1], [emotion2], [emotion3]'\nHere's the lyric: "

# sys = "You are a textual analyst summarizing the contents of pop song lyrics written mostly in Mandarin Chinese, but sometimes in Cantonese and Taiwanese dialects, and sometimes in multiple and mixed languages (like English or Japanese words or sentences mixed into Chinese)."

# usr = "For this song lyric, (1) use exactly one short sentence to summarize the theme of the song, (2) choose three nouns to name three major emotions of the song lyric, and (3) list ALL the languages/dialects that appear in the lyric, like Mandarin, pinyin, Cantonese, Hokkien, English, Japanese, etc. Use this format: 'Song [id]:\nTheme: [theme sentence]\nEmotions: [emotion1], [emotion2], [emotion3]'\nLanguages: [language 1], [language 2], ...\nHere's the lyric: "

# usr = "For this text, without telling me what the song is about, if you think this is a song lyric, output in this format: 'Song [replace this with song id]: This is a song.' If you think this is not a song lyric, output in this format: 'Song [replace this with song id]: Not a song.'"

# System prompt (Chinese). In English: "You are a text analyst skilled at
# summarizing the themes and emotional content of pop-song lyrics. The lyrics
# are most likely in Mandarin, but may be in a classical-Chinese poetic style,
# in Cantonese or Taiwanese dialect, and may contain fragments of English,
# Japanese, Korean, or other languages."
# NOTE: the name `sys` is the same as the standard-library module `sys`; it is
# kept because sumThemeEmotions() reads it by this name.
sys = "你是一位文本分析师, 擅长概括流行歌曲的歌词里的主题和情感内容. 你将要分析的歌词很可能是由中文普通话写成, 但也可能是古汉语诗词风格, 也可能由粤语或台湾方言写成, 而且中文歌词中还可能夹杂着英文, 日文, 韩文, 或其它语言的片段."

# User prompt prefix (Chinese); the lyric text is appended directly after it.
# In English: summarize the theme in one Chinese sentence of fewer than 40
# characters, then pick three two-character Chinese words for the three main
# emotions, and answer in Simplified Chinese using the format
#   '歌曲编号 <song id>: \n主题: <theme sentence> \n主要情感: <e1>, <e2>, <e3>'
# (i.e. "Song number / Theme / Major emotions"), without the square brackets.
usr = "对下面的歌词, 请先用一个少于40字的中文短句总结它的主题, 然后选择三个汉语词汇(双字词)来准确概括描述该歌词里包含的三种最主要的情感. 描述情感的词汇例如: 悲伤, 思念, 怀旧, 渴望, 期待, 爱恋, 幸福, 激情, 兴奋, 愤怒, 怨恨, 绝望, 犹豫, 羞怯, 坚定, 等等. 请用简体中文按照如下格式将你的总结结果输出为一个字符串 (不要保留方括号): '歌曲编号 [在此输入歌曲编号数字]: \n主题: [在此输入总结歌词主题的短句] \n主要情感: [在此输入情感词1], [在此输入情感词2], [在此输入情感词3]'\n下面是歌词: "

# usr = "对下面的歌词, 请先用一个少于40字的中文短句总结它的主题, 然后选择若干个汉语词汇(双字词)来准确概括描述该歌词里包含的最主要的情感. 描述情感的词汇例如: 悲伤, 思念, 怀旧, 渴望, 期待, 爱恋, 幸福, 激情, 兴奋, 愤怒, 怨恨, 绝望, 犹豫, 羞怯, 坚定, 等等. 请用简体中文按照如下格式将你的总结结果输出为一个字符串 (不要保留方括号): '歌曲编号 [在此输入歌曲编号数字]: \n主题: [在此输入总结歌词主题的短句] \n主要情感: [在此输入情感词1], [在此输入情感词2], ...'\n下面是歌词: "

def sumThemeEmotions(lyric):
    """Ask the chat model for a theme/emotion summary of one lyric.

    The module-level ``sys`` prompt is sent as the system message and
    ``usr + str(lyric)`` as the user message. Transient API failures are
    retried with exponential backoff (2, 4, 8, 16 seconds between the five
    attempts).

    Args:
        lyric: The lyric text (any object; it is converted with ``str``).

    Returns:
        The text content of the first completion choice, or ``None`` if the
        request failed with a non-retryable error or all retries were used up.
    """
    max_retries = 5  # Maximum number of attempts
    retry_delay = 2  # Initial delay between attempts in seconds
    backoff_factor = 2  # Factor by which the delay increases each time

    # The openai>=1.0 client reports failures through its own exception
    # classes rather than through `requests`, so transient failures must be
    # matched on the openai types or they would never be retried.
    # `requests.exceptions.RequestException` is kept for backward compatibility.
    retryable_errors = (
        openai.APIConnectionError,   # network problems; also covers APITimeoutError
        openai.RateLimitError,       # HTTP 429
        openai.InternalServerError,  # HTTP 5xx
        requests.exceptions.RequestException,
    )

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                # model="gpt-4",
                timeout=10,
                messages=[
                    {"role": "system", "content": sys},
                    {"role": "user", "content": usr + str(lyric)}
                ]
            )
            content = completion.choices[0].message.content
            print(content)
            return content
        except retryable_errors as e:
            if attempt + 1 >= max_retries:
                # No attempt left: do not sleep for nothing.
                print(f"Request failed: {e}.")
                break
            print(f"Request failed: {e}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= backoff_factor
        except Exception as e:
            # Anything else (bad request, auth failure, malformed response, ...)
            # will not be fixed by retrying, so give up on this lyric.
            print(f"An error occurred: {e}")
            return None

    print("Failed to get a response after several retries.")
    return None

# Section labels accepted in a GPT reply. Both the English labels used by the
# earlier prompts and the Chinese labels requested by the current prompt are
# recognised, each followed by a half-width or full-width colon. Matching is
# done on the lower-cased message.
_THEME_LABEL = re.compile(r'(?:theme|主题)[:：]')
_EMOTIONS_LABEL = re.compile(r'(?:major emotions|主要情感)[:：]')


def _tidyField(text):
    """Trim surrounding whitespace, then leading/trailing full stops."""
    return text.strip().strip('.。')


def splitThemeEmotion(message):
    """Split a GPT reply into its theme sentence and its emotion list.

    The reply is lower-cased first, so the returned texts are lower-case.
    The theme is the text after the first theme label; if an emotions label
    follows, the theme ends there and the emotions are the text after it.

    Args:
        message: The raw reply text. Non-string values (e.g. ``None`` or NaN
            left behind by failed requests) are treated as "nothing found".

    Returns:
        ``pd.Series([theme, emotions])``; either element is ``pd.NA`` when the
        corresponding label is missing.
    """
    # Default values for theme and emotions
    theme, emotions = pd.NA, pd.NA

    if isinstance(message, str):
        message = message.lower()

        theme_label = _THEME_LABEL.search(message)
        if theme_label:
            # Everything after the first theme label belongs to the theme
            # unless an emotions label cuts it short.
            theme = message[theme_label.end():]

            emotions_label = _EMOTIONS_LABEL.search(theme)
            if emotions_label:
                emotions = theme[emotions_label.end():]
                theme = theme[:emotions_label.start()]

    if pd.notna(theme):
        theme = _tidyField(theme)
    if pd.notna(emotions):
        emotions = _tidyField(emotions)

    return pd.Series([theme, emotions])


def extractID(message):
    """Return the first run of consecutive digits in ``message`` as an int.

    If ``message`` contains no digits at all, ``pd.NA`` is returned instead.
    """
    first_number = re.search(r'\d+', message)
    return pd.NA if first_number is None else int(first_number.group())


######################################## First pass: generate themes and emotions for all 300,000 songs #######################################################################################################
if __name__ == "__main__":

    # Process input segments 24..29 one at a time: read the segment, summarize
    # every lyric in parallel, and write one output CSV per segment.
    for i in range(24, 30):

        segment = pd.read_csv(f'Final Datasets/7.csv segments/ch_df7_narrow {i}.csv', low_memory=False)

        # Carry the real song ids through to the output instead of re-deriving
        # them from the row position and an assumed segment size; the same ids
        # are embedded in the text sent to the model.
        song_ids = segment['id2'].tolist()
        sample_lyrics = (
            '歌曲编号 ' + segment['id2'].astype(str) + ': ' + segment['cleaned_lyrics'].astype(str)
        ).tolist()

        start = time.time()

        with multiprocessing.Pool(processes=6) as pool:
            print('processing')
            # Pool.map returns results in input order, so results[k] is the
            # reply for song_ids[k] (None where the request failed).
            results = pool.map(sumThemeEmotions, sample_lyrics)

        # Wall-clock time spent on this segment's API calls
        runTime = time.time() - start
        print(str(runTime) + ' seconds')

        messages = pd.DataFrame({'id2': song_ids, 'GPT_Message': results})

        messages.to_csv(f'Final Datasets/7.csv segments/gpt_messages_ch {i}.csv', index=False)

################################### Second Time to Regenerate Themes and Emotions for Those Themes that are too long or No correct "Theme:" or "Major Emotions:" prefixes ##################################################################################################################################

# if __name__ == "__main__":
#     df = pd.read_csv('Final Datasets/7.csv segments/df_resend.csv', low_memory=False)
#     df['id_lyric'] = '歌曲编号 ' + df['id2'].astype(str) + ': ' + df['cleaned_lyrics'].astype(str)
   
#     df = df['id_lyric']
#     # df = df['id_lyric'].sample(n = 1000)

#     sample_lyrics = [value for value in df.values]

#     start = time.time()

#     with multiprocessing.Pool(processes=12) as pool:
#         print('processing')
#         results = pool.map(sumThemeEmotions, sample_lyrics)

#     # total runtime
#     runTime = time.time() - start
#     print(str(runTime) + ' seconds')

#     df = pd.DataFrame(results, columns=['GPT_Message'])

#     df.to_csv('Final Datasets/7.csv segments/df_resent.csv', index=False)

####################################### Keep regenerating Themes and Emotions until the max length of GPT_Message is less than 181 ###############################################################################


# df_hasEmoShort = pd.read_csv('Final Datasets/7.csv segments/df_hasEmoShort.csv', low_memory=False)
# df_regen = pd.read_csv('Final Datasets/7.csv segments/regenerated_GPT_Messages.csv', low_memory=False)
# df = pd.read_csv('Final Datasets/FinalMergedDataset 7 narrow.csv', low_memory=False)

# for i in range(0, 3):

#     # Applying the function to the 'GPT_Message' column
#     df_regen['id2'] = df_regen['GPT_Message'].apply(extractID)
#     df_regen = df_regen[['id2','GPT_Message']]

#     # Applying the function and creating new columns
#     df_regen[['theme', 'emotions']] = df_regen['GPT_Message'].apply(splitThemeEmotion)

#     df_regen['messLen'] = df_regen['GPT_Message'].str.len()
#     df_regen = df_regen.sort_values(by='messLen', ascending=False).reset_index(drop=True)

#     print('max length = ' + str(df_regen['messLen'][0]))

#     if df_regen['messLen'][0] < 181:
#         break

#     df_regen_noEmo = df_regen[pd.isna(df_regen['emotions'])]
#     df_regen_hasEmo = df_regen[pd.notna(df_regen['emotions'])]

#     df_regen_hasEmoLong = df_regen_hasEmo[df_regen_hasEmo['messLen'] > 180]
#     df_regen_hasEmoShort = df_regen_hasEmo[df_regen_hasEmo['messLen'] <= 180]

#     df_hasEmoShort = pd.concat([df_hasEmoShort, df_regen_hasEmoShort], ignore_index=True)
#     df_regen_resend = pd.concat([df_regen_noEmo, df_regen_hasEmoLong], ignore_index=True)

#     df_resent = df[df['id2'].isin(df_regen_resend['id2'])]

#     if __name__ == "__main__":

#         df_resent['id_lyric'] = 'song ' + df_resent['id2'].astype(str) + ': ' + df_resent['cleaned_lyrics'].astype(str)
#         df_resent = df_resent['id_lyric']

#         sample_lyrics = [value for value in df_resent.values]

#         start = time.time()

#         with multiprocessing.Pool(processes=12) as pool:
#             print('processing')
#             results = pool.map(sumThemeEmotions, sample_lyrics)

#         # total runtime
#         runTime = time.time() - start
#         print(str(runTime) + ' seconds')

#         df_regen = pd.DataFrame(results, columns=['GPT_Message'])

# df_regen.to_csv('Final Datasets/7.csv segments/regenerated_GPT_Messages_end.csv', index=False)
# df_hasEmoShort.to_csv('Final Datasets/7.csv segments/df_hasEmoShort_end.csv', index=False)

################################### Testing the consistency of 3 emotions (summarized by GPT) being the median and mean number among 1000 randomly sampled songs ##################################################################################################################################

# if __name__ == "__main__":
#     df = pd.read_csv('Final Datasets/df7_ch_split_emo scores.csv', low_memory=False)
#     df['id_lyric'] = '歌曲编号 ' + df['id'].astype(str) + ': ' + df['ch'].astype(str)
   
  
#     df = df['id_lyric'].sample(n = 1000)

#     sample_lyrics = [value for value in df.values]

#     start = time.time()

#     with multiprocessing.Pool(processes=12) as pool:
#         print('processing')
#         results = pool.map(sumThemeEmotions, sample_lyrics)

#     # total runtime
#     runTime = time.time() - start
#     print(str(runTime) + ' seconds')

#     df = pd.DataFrame(results, columns=['GPT_Message'])

#     df.to_csv('Final Datasets/7.csv segments/test.csv', index=False)

###############################################################################################################################################################################

